package helpers;

import java.util.logging.Level;
import java.util.logging.Logger;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import debugs.PageDebug;

import entities.Page;

/**
 * Static helpers that download a remote HTML page with jsoup, strip the
 * parts that are not wanted, rewrite references to the source site and
 * return the result as a {@link Page}.
 *
 * <p>Both public methods are best-effort: any failure while fetching or
 * parsing is logged as a warning and the {@link Page} is returned exactly
 * as it was constructed, without any setter having been called.
 */
public class CrawlerHelper {

	private static final Logger LOGGER = Logger.getLogger(CrawlerHelper.class.getName());

	/**
	 * Fetches an article page and extracts its title, body HTML, keyword
	 * list and breadcrumb markup.
	 *
	 * @param url address of the page to download; redirects are followed
	 * @return a {@link Page} with title, content, keywords and bredcrum set,
	 *         or a freshly constructed {@link Page} when anything fails
	 */
	public static Page getContent(String url) {
		Page page = new Page();
		try {
			Document doc = Jsoup.connect(url).followRedirects(true).get();
			Elements elms = doc.select(".page .middleSecPage");

			// Drop navigation, ads, scripts and comment widgets before the
			// HTML is serialised below.
			elms.select("#tabs,#about, #advantages, script").remove();
			elms.select(".AdsBottom336X280, .topicNavigations, .postcommentshow_tutorial").remove();

			// The <br>/<span> children of the title are removed from the
			// document itself, so this must run before elms.html() is read.
			Elements elmsTitle = elms.select(".pageTopicSec");
			elmsTitle.select("br, span").remove();
			String title = elmsTitle.text();

			String content = rewriteSiteReferences(elms.html());

			// Strip the opening <a ...> tags and turn every </a> into a comma.
			// NOTE(review): because each closing tag becomes a comma, a
			// non-empty result ends with a trailing comma; whatever separator
			// jsoup emits between elements is kept as-is. Left unchanged so
			// stored keywords keep their current format.
			String tags = elms.select("p.relatedtagsshow a").toString();
			tags = tags.replaceAll("[<]{1}(a)[^>]+[>]{1}", "").replaceAll("<\\/a>", ",");

			String bredcrum = doc.select(".page .bredcrum").html();

			// Setters are called only after every extraction step succeeded,
			// so a failure never leaves the page half populated.
			page.setTitle(title);
			page.setContent(content);
			page.setKeywords(tags);
			page.setBredcrum(bredcrum);

		} catch (Exception e) {
			// Best-effort crawl: keep returning the empty page, but record why.
			LOGGER.log(Level.WARNING, "Failed to crawl content page: " + url, e);
		}
		return page;
	}

	/**
	 * Fetches a tag listing page and extracts the inner table as HTML.
	 *
	 * @param url address of the page to download; redirects are followed
	 * @return a {@link Page} whose content is the selected table markup
	 *         wrapped in {@code <table>...</table>}, or a freshly constructed
	 *         {@link Page} when anything fails
	 */
	public static Page getTag(String url) {
		Page page = new Page();

		try {
			Document doc = Jsoup.connect(url).followRedirects(true).get();
			Elements elms = doc.select("div table tr:eq(1) td table");

			String content = rewriteSiteReferences("<table>" + elms.html() + "</table>");
			// Only a ".htm" immediately followed by a closing quote is
			// rewritten, i.e. one that ends a quoted attribute value.
			content = content.replace(".htm\"", ".shtml\"");

			page.setContent(content);

		} catch (Exception e) {
			// Best-effort crawl: keep returning the empty page, but record why.
			LOGGER.log(Level.WARNING, "Failed to crawl tag page: " + url, e);
		}

		return page;
	}

	/**
	 * Removes the absolute source-site prefixes from the markup and then
	 * replaces the remaining occurrences of the source brand name.
	 *
	 * <p>Order matters: the host prefixes contain the brand name, so they
	 * have to be stripped before the brand replacement runs.
	 */
	private static String rewriteSiteReferences(String html) {
		String rewritten = html.replace("http://www.roseindia.net", "");
		rewritten = rewritten.replace("http://roseindia.net", "");
		return rewritten.replace("roseindia", "JavaMonday");
	}
}
